import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
df = pd.read_csv('owid-covid-data.csv')
df
| iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | ... | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5.0 | 5.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 166321 | ZWE | Africa | Zimbabwe | 2022-03-01 | 236871.0 | 491.0 | 413.000 | 5395.0 | 0.0 | 1.000 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 166322 | ZWE | Africa | Zimbabwe | 2022-03-02 | 237503.0 | 632.0 | 416.286 | 5396.0 | 1.0 | 1.143 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 166323 | ZWE | Africa | Zimbabwe | 2022-03-03 | 237503.0 | 0.0 | 362.286 | 5396.0 | 0.0 | 0.857 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 166324 | ZWE | Africa | Zimbabwe | 2022-03-04 | 238739.0 | 1236.0 | 467.429 | 5397.0 | 1.0 | 0.714 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 166325 | ZWE | Africa | Zimbabwe | 2022-03-05 | 239019.0 | 280.0 | 459.429 | 5397.0 | 0.0 | 0.571 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
166326 rows × 67 columns
df.head()
| iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | ... | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5.0 | 5.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
5 rows × 67 columns
df.isna().sum()
iso_code 0
continent 9956
location 0
date 0
total_cases 3033
...
human_development_index 30073
excess_mortality_cumulative_absolute 160630
excess_mortality_cumulative 160630
excess_mortality 160630
excess_mortality_cumulative_per_million 160630
Length: 67, dtype: int64
# Drop every column that has at least 20,000 missing values, except
# "total_deaths", which is kept despite exceeding the threshold.
# The columns to remove are collected first so the frame is not mutated
# while it is being iterated.
missing_counts = df.isna().sum()
sparse_columns = [
    column
    for column, n_missing in missing_counts.items()
    if n_missing >= 20000 and column != "total_deaths"
]
df.drop(columns=sparse_columns, inplace=True)
df.isna().sum()
iso_code 0 continent 9956 location 0 date 0 total_cases 3033 new_cases 3193 new_cases_smoothed 5176 total_deaths 20875 total_cases_per_million 3791 new_cases_per_million 3951 new_cases_smoothed_per_million 5928 population 1075 population_density 18398 life_expectancy 11058 dtype: int64
df = df[df['continent'].notna()]
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 3026 new_cases 3193 new_cases_smoothed 5098 total_deaths 20662 total_cases_per_million 3026 new_cases_per_million 3193 new_cases_smoothed_per_million 5098 population 317 population_density 9216 life_expectancy 1876 dtype: int64
df['total_cases'].describe()
count 1.533440e+05 mean 6.444804e+05 std 3.202692e+06 min 1.000000e+00 25% 1.764000e+03 50% 2.042400e+04 75% 2.278665e+05 max 7.926573e+07 Name: total_cases, dtype: float64
df.boxplot(column = ['total_cases'])
print(len(df['total_cases']))
156370
# Tukey fences for total_cases: anything further than 1.5 * IQR outside the
# quartiles is treated as an outlier and removed.
q25, q75 = df['total_cases'].quantile([0.25, 0.75])
iqr = q75 - q25
lower_bound = q25 - 1.5 * iqr
upper_bound = q75 + 1.5 * iqr
# Comparisons against NaN are False, so rows with a missing total_cases
# are not flagged here and stay in the frame.
outside_fences = (df['total_cases'] < lower_bound) | (df['total_cases'] > upper_bound)
df = df.drop(df[outside_fences].index)
df.boxplot(column = ['total_cases'])
print(len(df['total_cases']))
133613
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 3026 new_cases 3153 new_cases_smoothed 4926 total_deaths 20662 total_cases_per_million 3026 new_cases_per_million 3153 new_cases_smoothed_per_million 4926 population 317 population_density 9216 life_expectancy 1876 dtype: int64
df['total_cases'].describe()
count 130587.000000 mean 72515.788601 std 123395.731988 min 1.000000 25% 909.000000 50% 11358.000000 75% 84804.500000 max 566966.000000 Name: total_cases, dtype: float64
df['total_cases'] = df['total_cases'].fillna(df['total_cases'].mean())
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 0 new_cases 3153 new_cases_smoothed 4926 total_deaths 20662 total_cases_per_million 3026 new_cases_per_million 3153 new_cases_smoothed_per_million 4926 population 317 population_density 9216 life_expectancy 1876 dtype: int64
# dropping the smoothed values
df.drop('new_cases_smoothed', axis = 1, inplace = True)
df.drop('new_cases_smoothed_per_million', axis = 1, inplace = True)
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 0 new_cases 3153 total_deaths 20662 total_cases_per_million 3026 new_cases_per_million 3153 population 317 population_density 9216 life_expectancy 1876 dtype: int64
df['total_deaths'] = df['total_deaths'].fillna(0)
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 0 new_cases 3153 total_deaths 0 total_cases_per_million 3026 new_cases_per_million 3153 population 317 population_density 9216 life_expectancy 1876 dtype: int64
df = df[df['new_cases'].notna()]
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 0 new_cases 0 total_deaths 0 total_cases_per_million 0 new_cases_per_million 0 population 0 population_density 7779 life_expectancy 722 dtype: int64
df = df[df['population_density'].notna()]
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 0 new_cases 0 total_deaths 0 total_cases_per_million 0 new_cases_per_million 0 population 0 population_density 0 life_expectancy 722 dtype: int64
df = df[df['life_expectancy'].notna()]
df.isna().sum()
iso_code 0 continent 0 location 0 date 0 total_cases 0 new_cases 0 total_deaths 0 total_cases_per_million 0 new_cases_per_million 0 population 0 population_density 0 life_expectancy 0 dtype: int64
# Apply the 1.5 * IQR outlier rule to each measurement column in turn.
# The four identifier columns are skipped. Each pass filters the frame
# produced by the previous pass, so later quartiles are computed on
# already-trimmed data.
for i in df:
    if i in ('iso_code', 'continent', 'location', 'date'):
        continue
    q25, q75 = df[i].quantile([0.25, 0.75])
    iqr = q75 - q25
    lower_bound = q25 - 1.5 * iqr
    upper_bound = q75 + 1.5 * iqr
    outside_fences = (df[i] < lower_bound) | (df[i] > upper_bound)
    df = df.drop(df[outside_fences].index)
df
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5.0 | 5.0 | 0.0 | 0.126 | 0.126 | 39835428.0 | 54.422 | 64.83 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5.0 | 0.0 | 0.0 | 0.126 | 0.000 | 39835428.0 | 54.422 | 64.83 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5.0 | 0.0 | 0.0 | 0.126 | 0.000 | 39835428.0 | 54.422 | 64.83 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5.0 | 0.0 | 0.0 | 0.126 | 0.000 | 39835428.0 | 54.422 | 64.83 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5.0 | 0.0 | 0.0 | 0.126 | 0.000 | 39835428.0 | 54.422 | 64.83 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 165892 | ZWE | Africa | Zimbabwe | 2020-12-27 | 13077.0 | 114.0 | 349.0 | 866.476 | 7.554 | 15092171.0 | 42.729 | 61.49 |
| 165893 | ZWE | Africa | Zimbabwe | 2020-12-28 | 13148.0 | 71.0 | 354.0 | 871.180 | 4.704 | 15092171.0 | 42.729 | 61.49 |
| 165894 | ZWE | Africa | Zimbabwe | 2020-12-29 | 13325.0 | 177.0 | 359.0 | 882.908 | 11.728 | 15092171.0 | 42.729 | 61.49 |
| 165896 | ZWE | Africa | Zimbabwe | 2020-12-31 | 13867.0 | 242.0 | 363.0 | 918.821 | 16.035 | 15092171.0 | 42.729 | 61.49 |
| 165897 | ZWE | Africa | Zimbabwe | 2021-01-01 | 14084.0 | 217.0 | 369.0 | 933.199 | 14.378 | 15092171.0 | 42.729 | 61.49 |
44369 rows × 12 columns
df.to_excel("New_data.xlsx", sheet_name = "newdata", index = False)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
df = pd.read_excel('New_data.xlsx')
df
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44364 | ZWE | Africa | Zimbabwe | 2020-12-27 | 13077 | 114 | 349 | 866.476 | 7.554 | 15092171 | 42.729 | 61.49 |
| 44365 | ZWE | Africa | Zimbabwe | 2020-12-28 | 13148 | 71 | 354 | 871.180 | 4.704 | 15092171 | 42.729 | 61.49 |
| 44366 | ZWE | Africa | Zimbabwe | 2020-12-29 | 13325 | 177 | 359 | 882.908 | 11.728 | 15092171 | 42.729 | 61.49 |
| 44367 | ZWE | Africa | Zimbabwe | 2020-12-31 | 13867 | 242 | 363 | 918.821 | 16.035 | 15092171 | 42.729 | 61.49 |
| 44368 | ZWE | Africa | Zimbabwe | 2021-01-01 | 14084 | 217 | 369 | 933.199 | 14.378 | 15092171 | 42.729 | 61.49 |
44369 rows × 12 columns
cases= df[df.continent == 'Africa']
cases['total_cases'].plot.hist(figsize=(15,10));
# Range of total_deaths over the European rows.
deathes = df[df.continent == 'Europe']
death = deathes['total_deaths']
minn, maxx = death.min(), death.max()
diff = maxx - minn
print('minimum = ', minn)
print('maximum = ', maxx)
print('difference = ', diff)
minimum = 0 maximum = 861 difference = 861
# Split the European total_deaths values into five consecutive intervals of
# equal (integer) width, starting at the minimum rather than at zero.
m = death.min()
l = (death.max() - death.min()) // 5  # length of each interval
# Upper edges of the first four intervals; the fifth is open-ended.
edges = [m + step * l for step in range(1, 5)]
g1 = death[death < edges[0]]
g2 = death[(death >= edges[0]) & (death < edges[1])]
g3 = death[(death >= edges[1]) & (death < edges[2])]
g4 = death[(death >= edges[2]) & (death < edges[3])]
g5 = death[death >= edges[3]]
print('length of each interval', l)
for bucket in (g1, g2, g3, g4, g5):
    print(len(bucket))
length of each interval 172 3413 564 93 308 119
groups = {'G1':len(g1), 'G2':len(g2), 'G3':len(g3), 'G4':len(g4), 'G5':len(g5)}
groups_df = pd.DataFrame(groups.values(), columns=['count'], index=groups.keys())
groups_df
| count | |
|---|---|
| G1 | 3413 |
| G2 | 564 |
| G3 | 93 |
| G4 | 308 |
| G5 | 119 |
groups_df.plot(kind='barh');
# Width of one population-density band: the observed range split into five
# equal parts. .min()/.max() are used instead of describe()[3] and
# describe()[7], which indexed a string-labelled Series by position.
mini = df.population_density.min()
# The upper end ignores densities above 200000.
maxx = df[df.population_density <= 200000].population_density.max()
rang = (maxx - mini) / 5
print(rang)
87.443
def Group(x, r=87.443):
    """Return the population-density band label for ``x``.

    Bands are half-open intervals of width ``r`` starting at zero:
    ``[0, r)`` -> 'g1', ``[r, 2r)`` -> 'g2', ``[2r, 3r)`` -> 'g3',
    ``[3r, 4r)`` -> 'g4'. Every other value -- ``4r`` and above, but also
    negative numbers and NaN -- is labelled 'g5'.

    Parameters
    ----------
    x : float
        Population density of a single row.
    r : float, optional
        Width of one band. Defaults to 87.443, the value printed by the
        band-width computation earlier in this file.

    Returns
    -------
    str
        One of 'g1', 'g2', 'g3', 'g4', 'g5'.
    """
    if 0 <= x < r:
        return 'g1'
    if r <= x < 2 * r:
        return 'g2'
    if 2 * r <= x < 3 * r:
        return 'g3'
    if 3 * r <= x < 4 * r:
        return 'g4'
    # Catch-all: also reached for negative or NaN inputs, because every
    # comparison above is False for them.
    return 'g5'
#df['groups']=df.Income.apply(lambda x: Group(x))
df['group'] = df.population_density.apply(lambda x: Group(x))
df.groupby('group')[['total_deaths', 'total_cases', 'life_expectancy']].mean().plot(kind='bar')
<AxesSubplot:xlabel='group'>
df[df.group=='g1']
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 | g1 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44364 | ZWE | Africa | Zimbabwe | 2020-12-27 | 13077 | 114 | 349 | 866.476 | 7.554 | 15092171 | 42.729 | 61.49 | g1 |
| 44365 | ZWE | Africa | Zimbabwe | 2020-12-28 | 13148 | 71 | 354 | 871.180 | 4.704 | 15092171 | 42.729 | 61.49 | g1 |
| 44366 | ZWE | Africa | Zimbabwe | 2020-12-29 | 13325 | 177 | 359 | 882.908 | 11.728 | 15092171 | 42.729 | 61.49 | g1 |
| 44367 | ZWE | Africa | Zimbabwe | 2020-12-31 | 13867 | 242 | 363 | 918.821 | 16.035 | 15092171 | 42.729 | 61.49 | g1 |
| 44368 | ZWE | Africa | Zimbabwe | 2021-01-01 | 14084 | 217 | 369 | 933.199 | 14.378 | 15092171 | 42.729 | 61.49 | g1 |
28567 rows × 13 columns
# Pairwise correlation of the numeric columns only. The string columns
# (iso_code, continent, location, date, group) are excluded explicitly,
# because DataFrame.corr() no longer drops them silently in pandas >= 2.0.
corr_matrix = df.select_dtypes(include='number').corr()
corr_matrix
| total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | cluster | |
|---|---|---|---|---|---|---|---|---|---|
| total_cases | 1.000000 | 0.422932 | 0.754284 | 0.260862 | 0.207327 | 0.413564 | -0.051436 | -0.219713 | -0.187709 |
| new_cases | 0.422932 | 1.000000 | 0.339806 | -0.052824 | 0.614576 | 0.469877 | -0.043941 | -0.028044 | -0.171167 |
| total_deaths | 0.754284 | 0.339806 | 1.000000 | 0.245181 | 0.219265 | 0.402440 | -0.076910 | -0.257147 | -0.151353 |
| total_cases_per_million | 0.260862 | -0.052824 | 0.245181 | 1.000000 | 0.113763 | -0.238132 | 0.035096 | 0.030257 | -0.033476 |
| new_cases_per_million | 0.207327 | 0.614576 | 0.219265 | 0.113763 | 1.000000 | 0.067308 | -0.007698 | 0.089054 | 0.011334 |
| population | 0.413564 | 0.469877 | 0.402440 | -0.238132 | 0.067308 | 1.000000 | -0.145466 | -0.300713 | -0.363063 |
| population_density | -0.051436 | -0.043941 | -0.076910 | 0.035096 | -0.007698 | -0.145466 | 1.000000 | 0.126993 | 0.140186 |
| life_expectancy | -0.219713 | -0.028044 | -0.257147 | 0.030257 | 0.089054 | -0.300713 | 0.126993 | 1.000000 | -0.041911 |
| cluster | -0.187709 | -0.171167 | -0.151353 | -0.033476 | 0.011334 | -0.363063 | 0.140186 | -0.041911 | 1.000000 |
# Annotated heatmap of the correlations between the numeric columns.
# Non-numeric columns are filtered out first so that corr() does not fail
# on strings under pandas >= 2.0.
plt.figure(figsize=(10, 10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()
df
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 | g1 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44364 | ZWE | Africa | Zimbabwe | 2020-12-27 | 13077 | 114 | 349 | 866.476 | 7.554 | 15092171 | 42.729 | 61.49 | g1 |
| 44365 | ZWE | Africa | Zimbabwe | 2020-12-28 | 13148 | 71 | 354 | 871.180 | 4.704 | 15092171 | 42.729 | 61.49 | g1 |
| 44366 | ZWE | Africa | Zimbabwe | 2020-12-29 | 13325 | 177 | 359 | 882.908 | 11.728 | 15092171 | 42.729 | 61.49 | g1 |
| 44367 | ZWE | Africa | Zimbabwe | 2020-12-31 | 13867 | 242 | 363 | 918.821 | 16.035 | 15092171 | 42.729 | 61.49 | g1 |
| 44368 | ZWE | Africa | Zimbabwe | 2021-01-01 | 14084 | 217 | 369 | 933.199 | 14.378 | 15092171 | 42.729 | 61.49 | g1 |
44369 rows × 13 columns
df_A = df[df['continent']=='Asia']
df_E = df[df['continent']=='Africa']
df_A['life_expectancy'].hist(alpha=0.5);
df_E['life_expectancy'].hist(alpha=0.5);
df_A.hist(figsize=(15,15));
df_E.hist(figsize=(15,15));
plt.figure(figsize=(15,5))
sns.barplot(x='continent', y='total_deaths', data=df)
plt.show()
max_index = df['total_cases'].idxmax()
max_location = df.loc[max_index, 'location']
max_value=df['total_cases'].max()
print(max_location + ' has the maximum number of total cases, with ' + str(max_value) + ' cases.')
Uzbekistan has the maximum number of total cases, with 116421 cases.
af= df[df.continent == 'South America']
plt.figure(figsize=(15,5))
sns.barplot(x='location', y='total_deaths', data=af)
plt.show()
grouped_data = df.groupby('continent')
max_cases = grouped_data.max()['total_cases']
max_cases = max_cases.sort_values(ascending=False)
max_cases
continent Asia 116421 Africa 106750 South America 98665 Europe 79852 North America 72049 Oceania 71122 Name: total_cases, dtype: int64
df
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 | g1 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44364 | ZWE | Africa | Zimbabwe | 2020-12-27 | 13077 | 114 | 349 | 866.476 | 7.554 | 15092171 | 42.729 | 61.49 | g1 |
| 44365 | ZWE | Africa | Zimbabwe | 2020-12-28 | 13148 | 71 | 354 | 871.180 | 4.704 | 15092171 | 42.729 | 61.49 | g1 |
| 44366 | ZWE | Africa | Zimbabwe | 2020-12-29 | 13325 | 177 | 359 | 882.908 | 11.728 | 15092171 | 42.729 | 61.49 | g1 |
| 44367 | ZWE | Africa | Zimbabwe | 2020-12-31 | 13867 | 242 | 363 | 918.821 | 16.035 | 15092171 | 42.729 | 61.49 | g1 |
| 44368 | ZWE | Africa | Zimbabwe | 2021-01-01 | 14084 | 217 | 369 | 933.199 | 14.378 | 15092171 | 42.729 | 61.49 | g1 |
44369 rows × 13 columns
# Report the location of the row with the largest single new_cases value.
max_indexs = df['new_cases'].idxmax()
max_locations = df.loc[max_indexs, 'location']
max_values = df['new_cases'].max()
# Uses max_locations (looked up from new_cases above), not max_location,
# which belongs to the earlier total_cases report.
print(max_locations + ' has the maximum number of new cases, with ' + str(max_values) + ' cases.')
Uzbekistan has the maximum number of total cases, with 472 cases.
# Ordinary least squares fit of total_deaths on new_cases for Peru
# (the frame is named `afg`, but it holds the Peru rows).
afg = df[df.location == 'Peru']
X = afg['new_cases'].values
Y = afg['total_deaths'].values
# Centre both variables once; the slope is cov(X, Y) / var(X).
x_centered = X - X.mean()
y_centered = Y - Y.mean()
a = (x_centered * y_centered).sum() / (x_centered ** 2).sum()
# The fitted line passes through the point of means.
b = np.mean(Y) - a * np.mean(X)
a
1.0328387466255473
b
-1.0761540027249623
def y_hat(x, a=1.0328387466255473, b=-1.0761540027249623):
    """Predict ``a * x + b`` for the fitted regression line.

    Parameters
    ----------
    x : float or array-like
        Predictor value(s).
    a : float, optional
        Slope. Defaults to the slope value shown earlier in this file.
    b : float, optional
        Intercept. Defaults to the intercept value shown earlier in this
        file.

    Returns
    -------
    Same type as ``a * x + b``.
    """
    return a * x + b
yhat = lambda x: a * x + b #prediction
plt.scatter(X,Y)
plt.plot([X.min(), X.max()], [yhat(X.min()), yhat(X.max())], color = 'g')
plt.scatter(X.mean(), Y.mean(), marker = 'D', c = 'r')
<matplotlib.collections.PathCollection at 0x2304a8d3190>
# Coefficient of determination of the fitted line, computed by hand:
# R2 = 1 - RSS / TSS.
Yhat = np.array([yhat(x) for x in X])
y_bar = np.mean(Y)
TSS = sum((y - y_bar) ** 2 for y in Y)                 # total sum of squares
RSS = sum((y - y_hat(x)) ** 2 for x, y in zip(X, Y))   # residual sum of squares
R2 = 1 - RSS / TSS
print(f'TSS is: {TSS}, RSS is: {RSS}, R2 is: {R2}')
TSS is: 356028.0, RSS is: 60733.20530727628, R2 is: 0.8294145255224974
sorted_counts = af['population_density'].value_counts()#south america
gen_percentage = sorted_counts/sorted_counts[:].sum()
plt.pie(gen_percentage, labels = sorted_counts.index, startangle = 200, counterclock = False);
import seaborn as sns
chad= df[df.location == 'Chad']
t = pd.DataFrame(af['new_cases'].value_counts().reset_index())
t.columns = ['date', 'new_cases']
plt.figure(figsize=(18,5))
sns.pointplot(x='date', y='new_cases', data=chad)
<AxesSubplot:xlabel='date', ylabel='new_cases'>
def calculate_w(x, y):
    """Solve the least-squares normal equations for the weights.

    Computes ``pinv(x.T @ x) @ (x.T @ y)``. The Moore-Penrose pseudo-inverse
    is used for the Gram matrix, so a singular ``x.T @ x`` does not raise.

    Parameters
    ----------
    x : design matrix, one row per observation.
    y : target values, one entry (or row) per observation.

    Returns
    -------
    The weight vector (or matrix) produced by the product above.
    """
    transposed = x.T
    gram_inverse = np.linalg.pinv(transposed @ x)
    return gram_inverse @ (transposed @ y)
def polynomial(x, degree):
    """Build a polynomial design matrix for a 1-D input.

    The columns are ``x**1, x**2, ..., x**degree`` followed by a final
    column of ones (the bias term), so the result has ``degree + 1`` columns.

    Parameters
    ----------
    x : array-like
        One value per observation. Plain lists and tuples are accepted as
        well as NumPy arrays.
    degree : int
        Highest power to include. With ``degree == 0`` only the column of
        ones is returned.

    Returns
    -------
    numpy.ndarray
        Array with one row per observation and ``degree + 1`` columns.
    """
    # Coerce once so that ** works element-wise on list/tuple input too.
    x = np.asarray(x)
    columns = [x ** power for power in range(1, degree + 1)]
    columns.append(np.ones(len(x)))  # bias column goes last
    return np.array(columns).T
degree=2
x_poly=polynomial(X,degree)
w=calculate_w(x_poly,Y)
w
array([9.23560843e-01, 3.49167594e-04, 2.16881279e+00])
df
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 | g1 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44364 | ZWE | Africa | Zimbabwe | 2020-12-27 | 13077 | 114 | 349 | 866.476 | 7.554 | 15092171 | 42.729 | 61.49 | g1 |
| 44365 | ZWE | Africa | Zimbabwe | 2020-12-28 | 13148 | 71 | 354 | 871.180 | 4.704 | 15092171 | 42.729 | 61.49 | g1 |
| 44366 | ZWE | Africa | Zimbabwe | 2020-12-29 | 13325 | 177 | 359 | 882.908 | 11.728 | 15092171 | 42.729 | 61.49 | g1 |
| 44367 | ZWE | Africa | Zimbabwe | 2020-12-31 | 13867 | 242 | 363 | 918.821 | 16.035 | 15092171 | 42.729 | 61.49 | g1 |
| 44368 | ZWE | Africa | Zimbabwe | 2021-01-01 | 14084 | 217 | 369 | 933.199 | 14.378 | 15092171 | 42.729 | 61.49 | g1 |
44369 rows × 13 columns
a= df[df.location == 'Angola']
X = a.loc[:, ['new_cases']]
y = a.loc[:, ['total_deaths']]
X
| new_cases | |
|---|---|
| 315 | 1 |
| 316 | 1 |
| 317 | 0 |
| 318 | 1 |
| 319 | 0 |
| ... | ... |
| 768 | 173 |
| 769 | 195 |
| 770 | 178 |
| 771 | 137 |
| 772 | 74 |
458 rows × 1 columns
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)
w1=calculate_w(X_train,y_train)
y_pred = np.matmul(X_test.values, w1)
plt.scatter(X_train.values, y_train.values,color='g')
plt.plot(X_test.values, y_pred.values,color='k')
plt.show()
TSS = sum([(y-np.mean(y_test.values))**2 for y in y_test.values])
RSS = sum([(yt-yp)**2 for yt,yp in zip(y_test.values,y_pred.values)])
R2 = 1 - RSS/TSS
R2[0]
0.11374681205089965
from sklearn.metrics import r2_score
r2_score(y_test.values, y_pred)
0.1137468120509002
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Linear regression of total_deaths on total_cases over the whole frame.
# scikit-learn expects 2-D inputs, hence the reshape to a single column.
a = df['total_cases'].values.reshape(-1, 1)
b = df['total_deaths'].values.reshape(-1, 1)
# A fixed seed (as in the earlier split in this file) makes the printed
# metrics reproducible between runs.
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.25, random_state=0)
regr = LinearRegression()
regr.fit(a_train, b_train)
b_pred = regr.predict(a_test)
print("The MSE will be: ", mean_squared_error(b_test, b_pred))
print("The R squared error will be: ", r2_score(b_test, b_pred))
The MSE will be: 13259.82900306039 The R squared error will be: 0.5741152546860264
plt.scatter(a_train, b_train, color='orange')
plt.scatter(a_test, b_pred, color='red')
plt.xlabel('total_cases')
plt.ylabel('total_Deaths')
plt.show()
df2=df_A.loc[:,'new_cases':'life_expectancy']
df2
| new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | |
|---|---|---|---|---|---|---|---|
| 0 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 |
| 1 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 2 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 3 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 4 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 43801 | 113 | 810 | 125.153 | 3.706 | 30490639 | 53.508 | 66.12 |
| 43802 | 84 | 820 | 127.908 | 2.755 | 30490639 | 53.508 | 66.12 |
| 43803 | 69 | 832 | 130.171 | 2.263 | 30490639 | 53.508 | 66.12 |
| 43804 | 64 | 851 | 132.270 | 2.099 | 30490639 | 53.508 | 66.12 |
| 43805 | 82 | 863 | 134.959 | 2.689 | 30490639 | 53.508 | 66.12 |
6431 rows × 7 columns
df_A
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 5 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 | g1 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 5 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 | g1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43801 | YEM | Asia | Yemen | 2021-03-25 | 3816 | 113 | 810 | 125.153 | 3.706 | 30490639 | 53.508 | 66.12 | g1 |
| 43802 | YEM | Asia | Yemen | 2021-03-26 | 3900 | 84 | 820 | 127.908 | 2.755 | 30490639 | 53.508 | 66.12 | g1 |
| 43803 | YEM | Asia | Yemen | 2021-03-27 | 3969 | 69 | 832 | 130.171 | 2.263 | 30490639 | 53.508 | 66.12 | g1 |
| 43804 | YEM | Asia | Yemen | 2021-03-28 | 4033 | 64 | 851 | 132.270 | 2.099 | 30490639 | 53.508 | 66.12 | g1 |
| 43805 | YEM | Asia | Yemen | 2021-03-29 | 4115 | 82 | 863 | 134.959 | 2.689 | 30490639 | 53.508 | 66.12 | g1 |
6431 rows × 13 columns
# Daily totals summed over all African countries, ordered by date.
# The rows are selected from df by continent here: df_A, used previously,
# holds the Asia rows, which contradicted both this variable's name and the
# "in Africa" titles of the plots built from it.
df_africa = (
    df[df['continent'] == 'Africa']
    .groupby(["date"])[["total_cases", "new_cases", "total_deaths"]]
    .sum()
    .reset_index()
    .sort_values("date", ascending=True)
    .reset_index(drop=True)
)
df_africa
| date | total_cases | new_cases | total_deaths | |
|---|---|---|---|---|
| 0 | 2020-01-25 | 5 | 5 | 0 |
| 1 | 2020-01-26 | 5 | 0 | 0 |
| 2 | 2020-01-27 | 7 | 2 | 0 |
| 3 | 2020-01-28 | 7 | 0 | 0 |
| 4 | 2020-01-29 | 14 | 7 | 0 |
| ... | ... | ... | ... | ... |
| 766 | 2022-03-01 | 17786 | 0 | 125 |
| 767 | 2022-03-02 | 17786 | 0 | 125 |
| 768 | 2022-03-03 | 17786 | 0 | 125 |
| 769 | 2022-03-04 | 17786 | 0 | 125 |
| 770 | 2022-03-05 | 17786 | 0 | 125 |
771 rows × 4 columns
plt.figure(figsize=(20,10))
plt.plot(df_africa['date'], df_africa['total_cases'],marker='o'
,c='r',ls='--',markersize=10)
plt.title('Evolution of Confirmed Covid-19 cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('Confirmed cases', fontsize=16)
Text(0, 0.5, 'Confirmed cases')
plt.figure(figsize=(20,10))
plt.plot(df_africa.index, df_africa['total_cases'])
plt.title('Evolution of Confirmed Covid-19 cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('Confirmed cases', fontsize=16)
Text(0, 0.5, 'Confirmed cases')
plt.figure(figsize=(20,10))
plt.plot(df_africa.index, df_africa['total_deaths'])
plt.title('Evolution of Covid-19 Deaths cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('Number of Deaths', fontsize=16)
Text(0, 0.5, 'Number of Deaths')
#during 20 days
plt.figure(figsize=(20,10))
plt.bar(df_africa['date'].head(20), df_africa['new_cases'].head(20))
plt.title('Evolution of Covid-19 New Cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('New Cases', fontsize=16)
Text(0, 0.5, 'New Cases')
import datetime
df_africa['date'] = pd.to_datetime(df_africa['date'], errors='coerce')
df_africa['Month'] = df_africa['date'].dt.month
print(df_africa)
date total_cases new_cases total_deaths Month 0 2020-01-25 5 5 0 1 1 2020-01-26 5 0 0 1 2 2020-01-27 7 2 0 1 3 2020-01-28 7 0 0 1 4 2020-01-29 14 7 0 1 .. ... ... ... ... ... 766 2022-03-01 17786 0 125 3 767 2022-03-02 17786 0 125 3 768 2022-03-03 17786 0 125 3 769 2022-03-04 17786 0 125 3 770 2022-03-05 17786 0 125 3 [771 rows x 5 columns]
# New cases per calendar month. The daily rows are summed per month first:
# passing the raw rows to plt.bar draws many bars at the same x position,
# which overdraw each other, so the chart would not show a monthly total.
# Note that the same month of different years is pooled together.
plt.figure(figsize=(20,10))
monthly_new_cases = df_africa.groupby('Month')['new_cases'].sum()
plt.bar(monthly_new_cases.index, monthly_new_cases.values)
plt.title('Evolution of Covid-19 New Cases over time in Africa', fontsize=16)
plt.xlabel('Months', fontsize=16)
plt.ylabel('New Cases', fontsize=16)
Text(0, 0.5, 'New Cases')
df_tunisia = df[df['location'] == 'Tunisia']
df_tunisia
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41788 | TUN | Africa | Tunisia | 2020-03-04 | 1 | 1 | 0 | 0.084 | 0.084 | 11935764 | 74.228 | 76.7 | g1 |
| 41789 | TUN | Africa | Tunisia | 2020-03-05 | 1 | 0 | 0 | 0.084 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
| 41790 | TUN | Africa | Tunisia | 2020-03-06 | 1 | 0 | 0 | 0.084 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
| 41791 | TUN | Africa | Tunisia | 2020-03-07 | 1 | 0 | 0 | 0.084 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
| 41792 | TUN | Africa | Tunisia | 2020-03-08 | 2 | 1 | 0 | 0.168 | 0.084 | 11935764 | 74.228 | 76.7 | g1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41985 | TUN | Africa | Tunisia | 2020-10-15 | 34790 | 0 | 512 | 2914.769 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
| 41986 | TUN | Africa | Tunisia | 2020-10-16 | 34790 | 0 | 512 | 2914.769 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
| 41987 | TUN | Africa | Tunisia | 2020-10-18 | 40542 | 0 | 626 | 3396.682 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
| 41988 | TUN | Africa | Tunisia | 2020-10-22 | 45892 | 0 | 740 | 3844.915 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
| 41989 | TUN | Africa | Tunisia | 2020-10-25 | 48799 | 0 | 819 | 4088.469 | 0.000 | 11935764 | 74.228 | 76.7 | g1 |
202 rows × 13 columns
plt.figure(figsize=(20,10))
plt.plot(df_tunisia.index, df_tunisia['new_cases'])
df_tunisia[df_tunisia['date'] == '2021-01-03']
| iso_code | continent | location | date | total_cases | new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | group |
|---|
df2=df_A.loc[:,'new_cases':'life_expectancy']
df2
| new_cases | total_deaths | total_cases_per_million | new_cases_per_million | population | population_density | life_expectancy | |
|---|---|---|---|---|---|---|---|
| 0 | 5 | 0 | 0.126 | 0.126 | 39835428 | 54.422 | 64.83 |
| 1 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 2 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 3 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| 4 | 0 | 0 | 0.126 | 0.000 | 39835428 | 54.422 | 64.83 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 43801 | 113 | 810 | 125.153 | 3.706 | 30490639 | 53.508 | 66.12 |
| 43802 | 84 | 820 | 127.908 | 2.755 | 30490639 | 53.508 | 66.12 |
| 43803 | 69 | 832 | 130.171 | 2.263 | 30490639 | 53.508 | 66.12 |
| 43804 | 64 | 851 | 132.270 | 2.099 | 30490639 | 53.508 | 66.12 |
| 43805 | 82 | 863 | 134.959 | 2.689 | 30490639 | 53.508 | 66.12 |
6431 rows × 7 columns
import scipy.cluster.hierarchy as sch
# graph size
plt.figure(1, figsize = (16 ,8))
# build the Ward-linkage hierarchy over the rows of df2 and draw it
dendrogram = sch.dendrogram(sch.linkage(df2, method = "ward"))
# plot annotations; the leaves are rows of df2, so the axis is labelled
# "Observations" rather than "Customers"
plt.title('Dendrogram')
plt.xlabel('Observations')
plt.ylabel('Euclidean distances')
plt.show()
from sklearn.cluster import AgglomerativeClustering
#from yellowbrick.cluster import KElbowVisualizer
#model = AgglomerativeClustering() # aglomerative
#k is range of number of clusters.
#visualizer = KElbowVisualizer(model, k=(2,30), timings=False) # by7seb el inertia
#Fit data to visualizer
#visualizer.fit(df2)
#Finalize and rend;er figure
#visualizer.show();
# best numbers of k of clusters
import scipy.cluster.hierarchy as sch
# size of image
plt.figure(1, figsize = (16 ,8))
# The argument is passed positionally: the keyword was renamed from `b` to
# `visible` in Matplotlib 3.5, and the positional form works before and
# after that rename.
plt.grid(None)
# creating the dendrogram
dend = sch.dendrogram(sch.linkage(df2, method='ward'))
# threshold line
plt.axhline(y=0.2, color='orange')
# plot annotations; the leaves are rows of df2, so the axis is labelled
# "Observations" rather than "Customers"
plt.title('Dendrogram')
plt.xlabel('Observations')
plt.ylabel('Euclidean distances')
plt.show();
C:\Users\dell\AppData\Local\Temp\ipykernel_4944\2204434934.py:5: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. plt.grid(b=None)
from sklearn.cluster import AgglomerativeClustering
import plotly.graph_objects as go
# Imported under its own alias: binding plotly to the name `plt` would
# shadow matplotlib.pyplot, which the rest of this file calls `plt`.
import plotly.offline as pyo

# Average-linkage agglomerative clustering into 5 clusters. Euclidean
# distance is the estimator's default, so the `affinity=` keyword (removed
# in recent scikit-learn releases) is simply omitted.
model = AgglomerativeClustering(n_clusters = 5, linkage ='average')
# training the model on the dataset; one cluster label per row of df2
y_model = model.fit_predict(df2)

# Attach the labels to the rows they were computed for. df2 is a subset of
# df, so the labels must be aligned on df2's index; rows of df that are not
# in df2 get NaN. (Wrapping y_model in a fresh 0..n-1 index would put the
# labels on the first n rows of df instead.)
df['cluster'] = pd.Series(y_model, index=df2.index)
# Only the clustered rows are plotted, with integer labels for colouring.
clustered = df.loc[df2.index].assign(cluster=y_model)

# creating scattered graph
trace1 = go.Scatter3d(
    # storing the variables in x, y, and z axis
    hovertext=clustered['cluster'],
    x= clustered['date'],
    y= clustered['total_cases'],
    z= clustered['total_deaths'],
    mode='markers',
    marker=dict(
        color = clustered['cluster'],
        size= 3,
        line=dict(
            color= clustered['cluster'],
            width= 12
        ),
        opacity=0.9
    )
)
# plotting graph
data = [trace1]
layout = go.Layout(
    title= 'Clusters using Agglomerative Clustering',
    scene = dict(
        xaxis = dict(title = 'date'),
        yaxis = dict(title = ' total_cases'),
        zaxis = dict(title = 'total_deaths')
    ),
    width=1024, height=512
)
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
X = df[['total_cases', 'total_deaths', 'population']].values
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
predictions = kmeans.predict(X)
df['cluster'] = predictions
%matplotlib notebook
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['total_cases'], df['total_deaths'], df['population'], c=df['cluster'])
ax.set_xlabel('Total cases')
ax.set_ylabel('Total deaths')
ax.set_zlabel('Population')
plt.show()